# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import ttest_ind

def select_knn(X, y, scorer, cv=5, verbose=True):
    """Grid-search the best k (n_neighbors) for a kNN regressor.

    Returns the best-parameter dict found by GridSearchCV.
    """
    param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15]}
    search = GridSearchCV(KNeighborsRegressor(),
                          param_grid=param_grid,
                          cv=cv,
                          scoring=scorer,
                          n_jobs=-1)
    search.fit(X, y)
    if verbose:
        print('kNN 最佳参数： ', search.best_params_)
    return search.best_params_

def select_svr(X, y, scorer, cv=5, verbose=True):
    """Grid-search C, kernel and epsilon for an SVR model.

    Returns the best-parameter dict found by GridSearchCV.
    """
    param_grid = {
        'C': [0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5,
              1.75, 2, 2.25, 2.5, 2.75, 3],
        'kernel': ['linear', 'rbf', 'poly'],
        'epsilon': [0, 0.01, 0.05, 0.1],
    }
    search = GridSearchCV(SVR(),
                          param_grid=param_grid,
                          cv=cv,
                          scoring=scorer,
                          n_jobs=-1)
    search.fit(X, y)
    if verbose:
        print('SVR 最佳参数： ', search.best_params_)
    return search.best_params_

def select_dtr(X, y, scorer, cv=5, verbose=True):
    """Grid-search max_depth and ccp_alpha for a decision-tree regressor.

    Returns the best-parameter dict found by GridSearchCV.
    """
    param_grid = {
        'max_depth': [4, 9, 13, 17, 21, 25],
        'ccp_alpha': [0, 0.00025, 0.0005, 0.001, 0.00125,
                      0.0015, 0.002, 0.005, 0.01, 0.05, 0.1],
    }
    search = GridSearchCV(DecisionTreeRegressor(),
                          param_grid=param_grid,
                          cv=cv,
                          scoring=scorer,
                          n_jobs=-1)
    search.fit(X, y)
    if verbose:
        print('决策树最佳参数： ', search.best_params_)
    return search.best_params_

def select_rf(X, y, scorer, cv=5, verbose=True):
    """Grid-search n_estimators for a random forest.

    The forest uses fixed subsampling settings (max_samples=0.67,
    max_features=0.33, max_depth=5); only the ensemble size is tuned.
    Returns the best-parameter dict found by GridSearchCV.
    """
    param_grid = {'n_estimators': [5, 15, 25, 35, 45, 50, 65, 75, 85, 95]}
    base_forest = RandomForestRegressor(max_samples=0.67,
                                        max_features=0.33,
                                        max_depth=5)
    search = GridSearchCV(base_forest,
                          param_grid=param_grid,
                          cv=cv,
                          scoring=scorer,
                          n_jobs=-1)
    search.fit(X, y)
    if verbose:
        print('随机森林最佳参数: ', search.best_params_)
    return search.best_params_


def select_ada(X, y, scorer, cv=5, verbose=True):
    """Grid-search n_estimators for AdaBoost with a LinearRegression base model.

    Note: the original (Chinese) docstring said the base model was logistic
    regression, but the code uses LinearRegression.
    Returns the best-parameter dict found by GridSearchCV.
    """
    param_grid = {'n_estimators': [5, 15, 25, 35, 45, 50, 65, 75, 85, 95]}
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and removed in 1.4 — keep in sync with the installed
    # version.
    ada = AdaBoostRegressor(base_estimator=LinearRegression())
    search = GridSearchCV(ada,
                          param_grid=param_grid,
                          cv=cv,
                          scoring=scorer,
                          n_jobs=-1)
    search.fit(X, y)
    if verbose:
        print('AdaBoost 最佳参数： ', search.best_params_)
    return search.best_params_

def select_model(X, y, scorer, cv=5, verbose=True):
    """Run every hyper-parameter search and collect the results.

    Returns the best-parameter dicts for (knn, svr, dtr, rf, ada),
    in that order.
    """
    selectors = (select_knn, select_svr, select_dtr, select_rf, select_ada)
    return tuple(select(X, y, scorer, cv, verbose=verbose)
                 for select in selectors)


def cv_score(X, y, scorer, cv=5, verbose=True,
             knn_param=None,
             svr_param=None,
             dtr_param=None,
             rf_param=None,
             ada_param=None):
    """Build the six candidate regressors and cross-validate each of them.

    Parameters
    ----------
    X, y : training features / target.
    scorer : scoring callable passed to cross_val_score (the caller uses MSE,
        not r2 as the original comment claimed).
    cv : number of cross-validation folds.
    verbose : if True, print the per-fold scores and their means.
    knn_param, svr_param, dtr_param, rf_param, ada_param :
        hyper-parameter dicts as returned by the select_* helpers; when None,
        the previously tuned defaults below are used (backward-compatible
        with the old dict defaults).

    Returns
    -------
    Tuple of six per-fold score arrays for (lr, knn, svr, dtr, rf, ada),
    in that order.
    """
    # BUG FIX: the defaults were mutable dicts in the signature (shared
    # across all calls); use None sentinels and fill them in locally.
    if knn_param is None:
        knn_param = {'n_neighbors': 3}
    if svr_param is None:
        svr_param = {'C': 2.75, 'kernel': 'poly', 'epsilon': 0.01}
    if dtr_param is None:
        dtr_param = {'ccp_alpha': 0, 'max_depth': 13}
    if rf_param is None:
        rf_param = {'n_estimators': 15}
    if ada_param is None:
        ada_param = {'n_estimators': 45}

    lr = LinearRegression()
    models = [
        lr,
        KNeighborsRegressor(n_neighbors=knn_param['n_neighbors']),
        SVR(C=svr_param['C'], kernel=svr_param['kernel'],
            epsilon=svr_param['epsilon']),
        DecisionTreeRegressor(max_depth=dtr_param['max_depth'],
                              ccp_alpha=dtr_param['ccp_alpha']),
        RandomForestRegressor(n_estimators=rf_param['n_estimators'],
                              max_samples=0.67, max_features=0.33,
                              max_depth=5),
        # NOTE(review): `base_estimator` was renamed to `estimator` in
        # scikit-learn 1.2 and removed in 1.4 — keep in sync with the
        # installed version.
        AdaBoostRegressor(base_estimator=lr,
                          n_estimators=ada_param['n_estimators']),
    ]

    # cross_val_score clones each estimator, so sharing `lr` above is safe.
    S_lr_i, S_knn_i, S_svr_i, S_dtr_i, S_rf_i, S_ada_i = (
        cross_val_score(model, X, y, scoring=scorer, cv=cv, n_jobs=-1)
        for model in models)

    if verbose:
        print(f'lr mse 均值 ： {np.mean(S_lr_i)}')
        print(f'knn mse 均值 ： {np.mean(S_knn_i)}')
        print(f'svr mse 均值 : {np.mean(S_svr_i)}')
        print(f'dtr mse 均值 :{np.mean(S_dtr_i)}')
        print(f'rf mse 均值 : {np.mean(S_rf_i)}')
        print(f'adaboost mse 均值 : {np.mean(S_ada_i)}')

        print('lr mse :', S_lr_i)
        print('knn mse :', S_knn_i)
        print('svr mse :', S_svr_i)
        print('dtr mse :', S_dtr_i)
        print('rf mse :', S_rf_i)
        print('adaboost mse ：', S_ada_i)

    return S_lr_i, S_knn_i, S_svr_i, S_dtr_i, S_rf_i, S_ada_i

def return_machine_learning_model(X, y, scorer, cv=5, verbose=True):
    """Tune hyper-parameters, compare the candidate models, and return a
    LinearRegression model fitted on a 70% training split.

    Parameters
    ----------
    scorer : scoring callable (the paper uses MSE).
    cv : number of cross-validation folds (the paper uses 5).
    verbose : whether to print intermediate results.
    """
    # 1. Tune the hyper-parameters of every candidate model.
    knn_param, svr_param, dtr_param, rf_param, ada_param = select_model(
        X, y, scorer, cv=cv, verbose=verbose)

    # 2. Cross-validate all candidates.
    # BUG FIX: the tuned parameters were computed but never passed on — the
    # original call evaluated cv_score's hard-coded defaults — and `verbose`
    # was not forwarded, so cv_score always printed.
    S_lr_i, S_knn_i, S_svr_i, S_dtr_i, S_rf_i, S_ada_i = cv_score(
        X, y, scorer=scorer, cv=cv, verbose=verbose,
        knn_param=knn_param, svr_param=svr_param, dtr_param=dtr_param,
        rf_param=rf_param, ada_param=ada_param)

    # 3. Two-sample t-test: are the lr and rf fold scores distinguishable?
    t, p_value = ttest_ind(S_lr_i, S_rf_i)
    if p_value > 0.05:
        print('无法拒绝原假设，即认为 lr 和 rf 模型等价')

    # 4. 70/30 train/test split, then fit and report the linear model.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    mse_train = mean_squared_error(y_train, lr.predict(X_train))
    mse_test = mean_squared_error(y_test, lr.predict(X_test))
    print('lr 模型在训练集、测试集的 MSE 分别为: ', mse_train, mse_test)
    return lr
    

if __name__ == '__main__':
    # NOTE(review): pickle.load is unsafe on untrusted files; acceptable here
    # only because the pickle is produced by our own pipeline.
    with open(r'../附件/intermedia_data.pkl', 'rb') as f:
        data = pickle.load(f)

    # Split rows by whether B-V is known. Use .copy() so later column
    # assignments do not operate on a view of `data`
    # (avoids SettingWithCopyWarning / silently ineffective inplace drops).
    has_bv = ~data['B-V'].isna()
    X = data.loc[has_bv].copy()
    X_without_BV = data.loc[~has_bv].copy()

    # B-V is the regression target.
    y = X['B-V']
    X = X.drop('B-V', axis=1)

    # BUG FIX: mean_squared_error is a loss, but make_scorer defaults to
    # greater_is_better=True, so GridSearchCV would MAXIMIZE the MSE and pick
    # the worst hyper-parameters. With greater_is_better=False the reported
    # scores are negated MSE (larger, i.e. closer to 0, is better).
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # Tune/compare models, then refit the chosen linear model on all rows
    # that have B-V.
    lr = return_machine_learning_model(X, y, scorer, cv=5, verbose=True)
    lr.fit(X, y)

    # BUG FIX: drop B-V by name instead of `iloc[:, :-1]`, which silently
    # assumed B-V was the last column.
    X_unlabeled = X_without_BV.drop('B-V', axis=1)
    y_BV_predict = lr.predict(X_unlabeled)
    X_unlabeled['B-V'] = y_BV_predict
    X['B-V'] = y

    # Recombine the observed and imputed halves and persist the result.
    X = pd.concat([X, X_unlabeled], axis=0)
    file_save_path = r'../附件/data_ml.pkl'
    with open(file_save_path, 'wb') as f:
        pickle.dump(X, f)